#Install packages (commented out; uncomment and run once if any are missing):
#Source: https://trinkerrstuff.wordpress.com/my-r-packages/qdap/
#if (!require("pacman")) install.packages("pacman")
#pacman::p_load(sentimentr, dplyr, magrittr)
#install.packages("devtools")
#install_github("trinker/qdapDictionaries")
#install_github("trinker/qdapRegex")
#install_github("trinker/qdapTools")
#install_github("trinker/qdap")
#install.packages("quanteda")
#install.packages("sentimentr")
#install.packages("ndjson")
#install.packages("NLP")
#install.packages("dplyr")
#install.packages("tidyr")
#install.packages("tm")
#install.packages("corpus")
#install.packages("syuzhet")
#install.packages("plotly")
#install.packages("wordcloud")
library(devtools)
## Loading required package: usethis
library(tm)
## Loading required package: NLP
library(qdap)
## Loading required package: qdapDictionaries
## Loading required package: qdapRegex
## Loading required package: qdapTools
## Loading required package: RColorBrewer
##
## Attaching package: 'qdap'
## The following objects are masked from 'package:tm':
##
## as.DocumentTermMatrix, as.TermDocumentMatrix
## The following object is masked from 'package:NLP':
##
## ngrams
## The following objects are masked from 'package:base':
##
## Filter, proportions
library(sentimentr)
## Registered S3 methods overwritten by 'textclean':
## method from
## print.check_text qdap
## print.sub_holder qdap
library(ndjson)
##
## Attaching package: 'ndjson'
## The following object is masked from 'package:qdapRegex':
##
## validate
library(corpus)
library(syuzhet)
##
## Attaching package: 'syuzhet'
## The following object is masked from 'package:sentimentr':
##
## get_sentences
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:qdapTools':
##
## id
## The following object is masked from 'package:qdapRegex':
##
## explain
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(quanteda)
## Package version: 3.1.0
## Unicode version: 13.0
## ICU version: 69.1
## Parallel computing: 4 of 4 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following object is masked from 'package:tm':
##
## stopwords
## The following objects are masked from 'package:NLP':
##
## meta, meta<-
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:qdapRegex':
##
## %+%
## The following object is masked from 'package:NLP':
##
## annotate
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:sentimentr':
##
## highlight
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(wordcloud)
#qdap's polarity scoring is a good fit here because it also accounts for negation words and amplifiers
#see: http://www.inside-r.org/packages/cran/qdap/docs/polarity
#getwd()
#setwd("C:/Ryerson University - Capstone project/Module 2/EIEEE - Large dataset/Combined")
#Read in the original May 2020 data set
data_set_may <- read.csv("corona_tweets_59 May 2020", header = T, sep = ",")
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec, :
## embedded nul(s) found in input
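#The embedded-nul warning comes from stray null bytes in the CSV. Assuming those bytes carry
#no data, one way to read the file without the warning is to skip them explicitly (a sketch):
#data_set_may <- read.csv("corona_tweets_59 May 2020", header = TRUE, sep = ",", skipNul = TRUE)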
#take a random sample of 1,000 tweets; set the seed so results are reproducible across the different analysis methods:
set.seed(1000)
rawData <- data_set_may[sample(nrow(data_set_may), size = 1000), ]
#write.csv(rawData,'rawData.csv')
str(rawData)
## 'data.frame': 1000 obs. of 35 variables:
## $ coordinates : chr "" "" "" "" ...
## $ created_at : chr "Sat May 16 23:31:16 +0000 2020" "Sat May 16 18:57:19 +0000 2020" "Sun May 17 02:30:46 +0000 2020" "Sat May 16 23:33:45 +0000 2020" ...
## $ hashtags : chr "" "" "" "" ...
## $ media : chr "" "" "" "" ...
## $ urls : chr "" "" "" "https://www.nbcnews.com/now/video/officials-warn-chinese-hackers-are-targeting-u-s-coronavirus-research-83422277503" ...
## $ favorite_count : int 0 0 0 0 0 0 0 0 1 1 ...
## $ id : num 1.26e+18 1.26e+18 1.26e+18 1.26e+18 1.26e+18 ...
## $ in_reply_to_screen_name : chr "" "" "" "" ...
## $ in_reply_to_status_id : num NA NA NA NA NA NA NA NA NA NA ...
## $ in_reply_to_user_id : num NA NA NA NA NA NA NA NA NA NA ...
## $ lang : chr "en" "en" "en" "en" ...
## $ place : chr "" "" "" "" ...
## $ possibly_sensitive : chr "" "" "" "false" ...
## $ quote_id : num NA NA NA NA 1.26e+18 ...
## $ retweet_count : int 25 338 441 0 0 12022 4 11 1 0 ...
## $ retweet_id : num 1.26e+18 1.26e+18 1.26e+18 NA NA ...
## $ retweet_screen_name : chr "business" "Suewilson91" "BreitbartNews" "" ...
## $ source : chr "<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>" "<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>" "<a href=\"http://www.echofon.com/\" rel=\"nofollow\">Echofon</a>" "<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>" ...
## $ text : chr "Many Americans have proven diligent in staying home to limit the spread of Covid-19. But their acceptance of so"| __truncated__ "Matt Hancock accused of being a 'liar' and told to resign after claiming he 'protected' care homes from the sta"| __truncated__ "Secretary of State @MikePompeo told Breitbart News that President Donald Trump is â\200œcommittedâ\200\235 to h"| __truncated__ "Officials warn Chinese hackers are targeting U.S. coronavirus research https://t.co/7tnGhf85MS via @nbcnews" ...
## $ tweet_url : chr "https://twitter.com/lemnosalt/status/1261801422430978048" "https://twitter.com/Hilary72926522/status/1261732478529740807" "https://twitter.com/BillSpears724/status/1261846593453637632" "https://twitter.com/Bet_the_ChE/status/1261802044354113536" ...
## $ user_created_at : chr "Tue Feb 10 00:25:20 +0000 2009" "Sun Dec 01 15:12:16 +0000 2019" "Fri Jan 06 19:30:57 +0000 2017" "Sun Aug 11 19:49:05 +0000 2013" ...
## $ user_id : num 2.05e+07 1.20e+18 8.17e+17 1.66e+09 1.26e+18 ...
## $ user_default_profile_image: chr "false" "false" "false" "false" ...
## $ user_description : chr "Groovy chick and media producer. All snark. No bite." "" "" "Just some engineer." ...
## $ user_favourites_count : int 92045 19675 1 46635 2788 1371 1230 18960 4 34505 ...
## $ user_followers_count : int 1469 45 65 263 426 97 109 2151 375 12607 ...
## $ user_friends_count : int 2526 229 228 1960 267 240 274 4846 227 12722 ...
## $ user_listed_count : int 73 0 1 1 4 1 0 15 13 106 ...
## $ user_location : chr "" "New Forest" "" "United States" ...
## $ user_name : chr "Lynn" "Hilary ðŸ’\231" "Bill Spears" "Bet" ...
## $ user_screen_name : chr "lemnosalt" "Hilary72926522" "BillSpears724" "Bet_the_ChE" ...
## $ user_statuses_count : int 35678 5272 24796 23697 1028 317 279 84594 14606 252203 ...
## $ user_time_zone : logi NA NA NA NA NA NA ...
## $ user_urls : chr "http://lynnmargherita.com" "" "" "" ...
## $ user_verified : chr "false" "false" "false" "false" ...
#create a corpus:
importdocs = corpus(rawData, text_field = 'text')
#preprocess the text
importdocs <- gsub("'", "", importdocs) # remove apostrophes
importdocs <- gsub("[[:punct:]]", " ", importdocs) # replace punctuation with space
importdocs <- gsub("[[:cntrl:]]", " ", importdocs) # replace control characters with space
importdocs <- gsub("^[[:space:]]+", "", importdocs) # remove whitespace at beginning of documents
importdocs <- gsub("[[:space:]]+$", "", importdocs) # remove whitespace at end of documents
importdocs <- tolower(importdocs)
# CLEANING TWEETS
importdocs = gsub("&", "", importdocs)                           # remove ampersands
importdocs = gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "", importdocs) # remove retweet/via markers and trailing handles
importdocs = gsub("@\\w+", "", importdocs)                       # remove remaining @mentions
importdocs = gsub("[[:digit:]]", "", importdocs)                 # remove digits
importdocs = gsub("http\\w+", "", importdocs)                    # remove URLs
importdocs = gsub("[ \t]{2,}", "", importdocs)                   # drop runs of two or more spaces/tabs
importdocs = gsub("^\\s+|\\s+$", "", importdocs)                 # trim leading/trailing whitespace
importdocs <- iconv(importdocs, "UTF-8", "ASCII", sub="")
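#The cleaning steps above could also be wrapped in a small helper so the same preprocessing
#can be reused on other monthly files. A minimal sketch (clean_tweets is a hypothetical name,
#and unlike the code above it collapses runs of whitespace to a single space):
#clean_tweets <- function(x) {
#  x <- gsub("'", "", x)                              # drop apostrophes
#  x <- gsub("(RT|via)((?:\\b\\W*@\\w+)+)", " ", x)   # retweet/via markers and handles
#  x <- gsub("@\\w+", " ", x)                         # remaining @mentions
#  x <- gsub("http\\w+", " ", x)                      # URLs
#  x <- gsub("[[:digit:]]", " ", x)                   # digits
#  x <- gsub("[[:punct:]]|[[:cntrl:]]", " ", x)       # punctuation and control characters
#  x <- tolower(x)
#  x <- gsub("\\s+", " ", x)                          # collapse whitespace
#  x <- trimws(x)
#  iconv(x, "UTF-8", "ASCII", sub = "")               # strip non-ASCII characters
#}
#importdocs <- clean_tweets(corpus(rawData, text_field = "text"))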
str(importdocs)
## 'corpus' Named chr [1:1000] "many americans have proven diligent in staying home to limit the spread of covidbut their acceptance of social "| __truncated__ ...
## - attr(*, "names")= chr [1:1000] "text1" "text2" "text3" "text4" ...
## - attr(*, "docvars")='data.frame': 1000 obs. of 37 variables:
## ..$ docname_ : chr [1:1000] "text1" "text2" "text3" "text4" ...
## ..$ docid_ : Factor w/ 1000 levels "text1","text2",..: 1 2 3 4 5 6 7 8 9 10 ...
## ..$ segid_ : int [1:1000] 1 1 1 1 1 1 1 1 1 1 ...
## ..$ coordinates : chr [1:1000] "" "" "" "" ...
## ..$ created_at : chr [1:1000] "Sat May 16 23:31:16 +0000 2020" "Sat May 16 18:57:19 +0000 2020" "Sun May 17 02:30:46 +0000 2020" "Sat May 16 23:33:45 +0000 2020" ...
## ..$ hashtags : chr [1:1000] "" "" "" "" ...
## ..$ media : chr [1:1000] "" "" "" "" ...
## ..$ urls : chr [1:1000] "" "" "" "https://www.nbcnews.com/now/video/officials-warn-chinese-hackers-are-targeting-u-s-coronavirus-research-83422277503" ...
## ..$ favorite_count : int [1:1000] 0 0 0 0 0 0 0 0 1 1 ...
## ..$ id : num [1:1000] 1.26e+18 1.26e+18 1.26e+18 1.26e+18 1.26e+18 ...
## ..$ in_reply_to_screen_name : chr [1:1000] "" "" "" "" ...
## ..$ in_reply_to_status_id : num [1:1000] NA NA NA NA NA NA NA NA NA NA ...
## ..$ in_reply_to_user_id : num [1:1000] NA NA NA NA NA NA NA NA NA NA ...
## ..$ lang : chr [1:1000] "en" "en" "en" "en" ...
## ..$ place : chr [1:1000] "" "" "" "" ...
## ..$ possibly_sensitive : chr [1:1000] "" "" "" "false" ...
## ..$ quote_id : num [1:1000] NA NA NA NA 1.26e+18 ...
## ..$ retweet_count : int [1:1000] 25 338 441 0 0 12022 4 11 1 0 ...
## ..$ retweet_id : num [1:1000] 1.26e+18 1.26e+18 1.26e+18 NA NA ...
## ..$ retweet_screen_name : chr [1:1000] "business" "Suewilson91" "BreitbartNews" "" ...
## ..$ source : chr [1:1000] "<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>" "<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>" "<a href=\"http://www.echofon.com/\" rel=\"nofollow\">Echofon</a>" "<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>" ...
## ..$ tweet_url : chr [1:1000] "https://twitter.com/lemnosalt/status/1261801422430978048" "https://twitter.com/Hilary72926522/status/1261732478529740807" "https://twitter.com/BillSpears724/status/1261846593453637632" "https://twitter.com/Bet_the_ChE/status/1261802044354113536" ...
## ..$ user_created_at : chr [1:1000] "Tue Feb 10 00:25:20 +0000 2009" "Sun Dec 01 15:12:16 +0000 2019" "Fri Jan 06 19:30:57 +0000 2017" "Sun Aug 11 19:49:05 +0000 2013" ...
## ..$ user_id : num [1:1000] 2.05e+07 1.20e+18 8.17e+17 1.66e+09 1.26e+18 ...
## ..$ user_default_profile_image: chr [1:1000] "false" "false" "false" "false" ...
## ..$ user_description : chr [1:1000] "Groovy chick and media producer. All snark. No bite." "" "" "Just some engineer." ...
## ..$ user_favourites_count : int [1:1000] 92045 19675 1 46635 2788 1371 1230 18960 4 34505 ...
## ..$ user_followers_count : int [1:1000] 1469 45 65 263 426 97 109 2151 375 12607 ...
## ..$ user_friends_count : int [1:1000] 2526 229 228 1960 267 240 274 4846 227 12722 ...
## ..$ user_listed_count : int [1:1000] 73 0 1 1 4 1 0 15 13 106 ...
## ..$ user_location : chr [1:1000] "" "New Forest" "" "United States" ...
## ..$ user_name : chr [1:1000] "Lynn" "Hilary ðŸ’\231" "Bill Spears" "Bet" ...
## ..$ user_screen_name : chr [1:1000] "lemnosalt" "Hilary72926522" "BillSpears724" "Bet_the_ChE" ...
## ..$ user_statuses_count : int [1:1000] 35678 5272 24796 23697 1028 317 279 84594 14606 252203 ...
## ..$ user_time_zone : logi [1:1000] NA NA NA NA NA NA ...
## ..$ user_urls : chr [1:1000] "http://lynnmargherita.com" "" "" "" ...
## ..$ user_verified : chr [1:1000] "false" "false" "false" "false" ...
## - attr(*, "meta")=List of 3
## ..$ system:List of 6
## .. ..$ package-version:Classes 'package_version', 'numeric_version' hidden list of 1
## .. .. ..$ : int [1:3] 3 1 0
## .. ..$ r-version :Classes 'R_system_version', 'package_version', 'numeric_version' hidden list of 1
## .. .. ..$ : int [1:3] 4 1 1
## .. ..$ system : Named chr [1:3] "Windows" "x86-64" "jbloos"
## .. .. ..- attr(*, "names")= chr [1:3] "sysname" "machine" "user"
## .. ..$ directory : chr "C:/Ryerson University - Capstone project/Module 2/EIEEE - Large dataset/Combined"
## .. ..$ created : Date[1:1], format: "2021-11-08"
## .. ..$ source : chr "data.frame"
## ..$ object:List of 2
## .. ..$ unit : chr "documents"
## .. ..$ summary:List of 2
## .. .. ..$ hash: chr(0)
## .. .. ..$ data: NULL
## ..$ user : list()
mycorpus <- get_sentences(importdocs)
mysentiment <- sentiment(mycorpus)
mysentiment
## element_id sentence_id word_count sentiment
## 1: 1 1 25 0.310000000
## 2: 2 1 32 0.008838835
## 3: 3 1 38 -0.129777137
## 4: 4 1 12 -0.216506351
## 5: 5 1 19 0.149120227
## ---
## 996: 996 1 38 0.129777137
## 997: 997 1 15 0.232379001
## 998: 998 1 7 0.377964473
## 999: 999 1 41 -0.312347524
## 1000: 1000 1 14 0.000000000
# summarise the overall score; the result is roughly neutral with a slight positive lean
summary(mysentiment$sentiment)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.37620 -0.13606 0.00000 0.01764 0.18415 0.92095
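#If any tweet had split into multiple sentences, sentiment_by() (also from sentimentr) would
#average the sentence scores back to one value per tweet; a sketch (each tweet in this sample
#yielded a single sentence, so the result would match sentiment()):
#mysentiment_by <- sentiment_by(mycorpus)
#summary(mysentiment_by$ave_sentiment)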
#plot the distribution of sentiment scores as a histogram
qplot(mysentiment$sentiment, geom="histogram", binwidth=0.1, main="Tweet Sentiment Histogram")
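#qplot() is soft-deprecated in newer ggplot2 releases; an equivalent ggplot() call
#(a sketch producing the same histogram) would be:
#ggplot(data.frame(sentiment = mysentiment$sentiment), aes(sentiment)) +
#  geom_histogram(binwidth = 0.1) +
#  ggtitle("Tweet Sentiment Histogram")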

#source: https://www.programmingr.com/sentiment-analysis/
#extract_sentiment_terms() returns the individual sentiment words along with their polarity strength and counts.
t = extract_sentiment_terms(mycorpus)
attributes(t)$count
## words polarity n
## 1: care 1.00 30
## 2: please 1.00 13
## 3: understand 1.00 8
## 4: top 1.00 7
## 5: truth 1.00 5
## ---
## 7829: would have -1.05 3
## 7830: could have -1.05 3
## 7831: should have -1.05 2
## 7832: too many -2.00 6
## 7833: too much -2.00 1
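#To see which sentiment terms dominate overall, the count attribute (the data.table shown
#above) can be sorted with dplyr; a quick sketch:
#term_counts <- attributes(t)$count
#term_counts %>% arrange(desc(n)) %>% head(10)                             # most frequent sentiment terms
#term_counts %>% filter(polarity < 0) %>% arrange(desc(n)) %>% head(10)    # most frequent negative terms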
#show the positive and negative words identified in the first 20 tweets:
head(t,20)
## element_id sentence_id negative positive
## 1: 1 1 limit proven,diligent,acceptance
## 2: 2 1 accused,liar,resign protected,care,league
## 3: 3 1 trump,communist,pandemic accountable
## 4: 4 1 warn
## 5: 5 1 like,good
## 6: 6 1 deny
## 7: 7 1 obtaining,results,art
## 8: 8 1 bad
## 9: 9 1
## 10: 10 1 threaten
## 11: 11 1 cut work,freedom
## 12: 12 1 safe
## 13: 13 1
## 14: 14 1 death,death content,measured
## 15: 15 1
## 16: 16 1
## 17: 17 1 confirmed,positive
## 18: 18 1 good
## 19: 19 1 care
## 20: 20 1 ignorant flatter,flatter
# The emotion() function returns the rate of each emotion per sentence. It returns a data frame whose columns of interest are emotion_type and emotion; emotion is the emotion_count divided by the sentence word count, i.e. the strength of that emotion in the sentence.
emotion(mycorpus[1:2])
##     element_id sentence_id word_count         emotion_type emotion_count emotion
##  1:          1           1         25                anger             0 0.00000
##  2:          1           1         25              disgust             0 0.00000
##  3:          1           1         25                 fear             0 0.00000
##  4:          1           1         25              sadness             0 0.00000
##  5:          1           1         25                trust             1 0.04000
##  6:          1           1         25        anger_negated             0 0.00000
##  7:          1           1         25         anticipation             0 0.00000
##  8:          1           1         25 anticipation_negated             0 0.00000
##  9:          1           1         25      disgust_negated             0 0.00000
## 10:          1           1         25         fear_negated             0 0.00000
## 11:          1           1         25                  joy             0 0.00000
## 12:          1           1         25          joy_negated             0 0.00000
## 13:          1           1         25      sadness_negated             0 0.00000
## 14:          1           1         25             surprise             0 0.00000
## 15:          1           1         25     surprise_negated             0 0.00000
## 16:          1           1         25        trust_negated             0 0.00000
## 17:          2           1         32                anger             2 0.06250
## 18:          2           1         32              disgust             2 0.06250
## 19:          2           1         32                 fear             2 0.06250
## 20:          2           1         32              sadness             1 0.03125
## 21:          2           1         32                trust             1 0.03125
## 22:          2           1         32        anger_negated             0 0.00000
## 23:          2           1         32         anticipation             0 0.00000
## 24:          2           1         32 anticipation_negated             0 0.00000
## 25:          2           1         32      disgust_negated             0 0.00000
## 26:          2           1         32         fear_negated             0 0.00000
## 27:          2           1         32                  joy             0 0.00000
## 28:          2           1         32          joy_negated             0 0.00000
## 29:          2           1         32      sadness_negated             0 0.00000
## 30:          2           1         32             surprise             0 0.00000
## 31:          2           1         32     surprise_negated             0 0.00000
## 32:          2           1         32        trust_negated             0 0.00000
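#A quick sketch for ranking emotions over the whole sample, summing the per-sentence counts
#shown above with dplyr (not run here; the NRC-based tally further below serves the same purpose):
#emotion(mycorpus) %>%
#  group_by(emotion_type) %>%
#  summarise(total = sum(emotion_count)) %>%
#  arrange(desc(total))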
# plot the sentiment results; sentimentr's plot method shows the emotional valence (smoothed sentiment) across the sampled tweets
plot(mysentiment)

#append the sentence-level sentiment scores to the sampled data set (one row per tweet in this sample, so the scores align row-for-row)
sentimentResultMay2020 <- rawData
sentimentResultMay2020$sentiment_score = mysentiment$sentiment
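#A quick sanity check (sketch) that the scores line up row-for-row with the sampled tweets:
#stopifnot(nrow(mysentiment) == nrow(sentimentResultMay2020),
#          all(mysentiment$element_id == seq_len(nrow(sentimentResultMay2020))))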
str(sentimentResultMay2020)
## 'data.frame': 1000 obs. of 36 variables:
## $ coordinates : chr "" "" "" "" ...
## $ created_at : chr "Sat May 16 23:31:16 +0000 2020" "Sat May 16 18:57:19 +0000 2020" "Sun May 17 02:30:46 +0000 2020" "Sat May 16 23:33:45 +0000 2020" ...
## $ hashtags : chr "" "" "" "" ...
## $ media : chr "" "" "" "" ...
## $ urls : chr "" "" "" "https://www.nbcnews.com/now/video/officials-warn-chinese-hackers-are-targeting-u-s-coronavirus-research-83422277503" ...
## $ favorite_count : int 0 0 0 0 0 0 0 0 1 1 ...
## $ id : num 1.26e+18 1.26e+18 1.26e+18 1.26e+18 1.26e+18 ...
## $ in_reply_to_screen_name : chr "" "" "" "" ...
## $ in_reply_to_status_id : num NA NA NA NA NA NA NA NA NA NA ...
## $ in_reply_to_user_id : num NA NA NA NA NA NA NA NA NA NA ...
## $ lang : chr "en" "en" "en" "en" ...
## $ place : chr "" "" "" "" ...
## $ possibly_sensitive : chr "" "" "" "false" ...
## $ quote_id : num NA NA NA NA 1.26e+18 ...
## $ retweet_count : int 25 338 441 0 0 12022 4 11 1 0 ...
## $ retweet_id : num 1.26e+18 1.26e+18 1.26e+18 NA NA ...
## $ retweet_screen_name : chr "business" "Suewilson91" "BreitbartNews" "" ...
## $ source : chr "<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>" "<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>" "<a href=\"http://www.echofon.com/\" rel=\"nofollow\">Echofon</a>" "<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>" ...
## $ text : chr "Many Americans have proven diligent in staying home to limit the spread of Covid-19. But their acceptance of so"| __truncated__ "Matt Hancock accused of being a 'liar' and told to resign after claiming he 'protected' care homes from the sta"| __truncated__ "Secretary of State @MikePompeo told Breitbart News that President Donald Trump is â\200œcommittedâ\200\235 to h"| __truncated__ "Officials warn Chinese hackers are targeting U.S. coronavirus research https://t.co/7tnGhf85MS via @nbcnews" ...
## $ tweet_url : chr "https://twitter.com/lemnosalt/status/1261801422430978048" "https://twitter.com/Hilary72926522/status/1261732478529740807" "https://twitter.com/BillSpears724/status/1261846593453637632" "https://twitter.com/Bet_the_ChE/status/1261802044354113536" ...
## $ user_created_at : chr "Tue Feb 10 00:25:20 +0000 2009" "Sun Dec 01 15:12:16 +0000 2019" "Fri Jan 06 19:30:57 +0000 2017" "Sun Aug 11 19:49:05 +0000 2013" ...
## $ user_id : num 2.05e+07 1.20e+18 8.17e+17 1.66e+09 1.26e+18 ...
## $ user_default_profile_image: chr "false" "false" "false" "false" ...
## $ user_description : chr "Groovy chick and media producer. All snark. No bite." "" "" "Just some engineer." ...
## $ user_favourites_count : int 92045 19675 1 46635 2788 1371 1230 18960 4 34505 ...
## $ user_followers_count : int 1469 45 65 263 426 97 109 2151 375 12607 ...
## $ user_friends_count : int 2526 229 228 1960 267 240 274 4846 227 12722 ...
## $ user_listed_count : int 73 0 1 1 4 1 0 15 13 106 ...
## $ user_location : chr "" "New Forest" "" "United States" ...
## $ user_name : chr "Lynn" "Hilary ðŸ’\231" "Bill Spears" "Bet" ...
## $ user_screen_name : chr "lemnosalt" "Hilary72926522" "BillSpears724" "Bet_the_ChE" ...
## $ user_statuses_count : int 35678 5272 24796 23697 1028 317 279 84594 14606 252203 ...
## $ user_time_zone : logi NA NA NA NA NA NA ...
## $ user_urls : chr "http://lynnmargherita.com" "" "" "" ...
## $ user_verified : chr "false" "false" "false" "false" ...
## $ sentiment_score : num 0.31 0.00884 -0.12978 -0.21651 0.14912 ...
#identify the tweet with the maximum (most positive) sentiment score
max(mysentiment$sentiment)
## [1] 0.9209474
maxSentiment <- sentimentResultMay2020[which.max(sentimentResultMay2020$sentiment_score),]
maxSentiment$text
## [1] "Amsterdam and Milan are both demonstrating how cities can emerge from the #COVID19 crisis stronger and more resilient than before. By embracing innovative approaches to a #GreenRecovery, we can build a more equitable and sustainable future https://t.co/Gw8Qk4Tbhc"
#identify the tweet with the minimum (most negative) sentiment score
min(mysentiment$sentiment)
## [1] -1.376195
minSentiment <- sentimentResultMay2020[which.min(sentimentResultMay2020$sentiment_score),]
minSentiment$text
## [1] "Remember Trump's idiotic statement about too much testing showing too many infections?\n\nTrump really thinks this way\n\nHe really doesn't want more testing\n\nhttps://t.co/UJmDE0nvbP"
#write the data set with appended sentiment scores to CSV
write.csv(sentimentResultMay2020,'sentimentResultMay2020.csv')
#Source: https://www.tabvizexplorer.com/sentiment-analysis-using-r-and-twitter/
#score the emotions in each tweet with syuzhet's NRC lexicon, which breaks sentiment
#into 10 categories (eight emotions plus positive and negative)
emotions <- get_nrc_sentiment(importdocs)
emo_bar = colSums(emotions)
emo_sum = data.frame(count=emo_bar, emotion=names(emo_bar))
emo_sum$emotion = factor(emo_sum$emotion, levels=emo_sum$emotion[order(emo_sum$count, decreasing = TRUE)])
# visualize which emotion types are dominant in the tweets (NRC sentiments)
library(plotly)
p <- plot_ly(emo_sum, x=~emotion, y=~count, type="bar", color=~emotion) %>%
layout(xaxis=list(title=""), showlegend=FALSE,
title="Emotion Type for Covid related hastags (source: IEEE)")
p
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
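#The warnings appear because the default Set2 palette has only 8 colors while there are
#10 emotion categories. One way to avoid them (a sketch) is to interpolate a 10-color
#palette explicitly and pass it via plot_ly()'s colors argument:
#ten_colors <- colorRampPalette(RColorBrewer::brewer.pal(8, "Set2"))(10)
#p <- plot_ly(emo_sum, x=~emotion, y=~count, type="bar", color=~emotion, colors=ten_colors) %>%
#  layout(xaxis=list(title=""), showlegend=FALSE,
#         title="Emotion Type for Covid related hashtags (source: IEEE)")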
#The positive category dominates, suggesting that the majority of sampled tweets discuss Covid in positive terms.
# Create comparison word cloud data
wordcloud_tweet = c(
paste(importdocs[emotions$anger > 0], collapse=" "),
paste(importdocs[emotions$anticipation > 0], collapse=" "),
paste(importdocs[emotions$disgust > 0], collapse=" "),
paste(importdocs[emotions$fear > 0], collapse=" "),
paste(importdocs[emotions$joy > 0], collapse=" "),
paste(importdocs[emotions$sadness > 0], collapse=" "),
paste(importdocs[emotions$surprise > 0], collapse=" "),
paste(importdocs[emotions$trust > 0], collapse=" ")
)
# create corpus
corpus = Corpus(VectorSource(wordcloud_tweet))
# convert to lower case, remove punctuation and English stop words, then stem the remaining words
corpus = tm_map(corpus, tolower)
## Warning in tm_map.SimpleCorpus(corpus, tolower): transformation drops documents
corpus = tm_map(corpus, removePunctuation)
## Warning in tm_map.SimpleCorpus(corpus, removePunctuation): transformation drops
## documents
corpus = tm_map(corpus, removeWords, c(stopwords("english")))
## Warning in tm_map.SimpleCorpus(corpus, removeWords, c(stopwords("english"))):
## transformation drops documents
corpus = tm_map(corpus, stemDocument)
## Warning in tm_map.SimpleCorpus(corpus, stemDocument): transformation drops
## documents
#the "transformation drops documents" warnings above are expected with a SimpleCorpus and can safely be ignored here
# create a term-document matrix (terms as rows, one column per emotion group)
tdm = TermDocumentMatrix(corpus)
# convert to a plain matrix
tdm = as.matrix(tdm)
tdmnew <- tdm[nchar(rownames(tdm)) < 11,] # keep terms under 11 characters so they render legibly in the cloud
#The comparison cloud shows which words contribute to which emotion.
# label the columns with the eight emotion names
colnames(tdm) = c('anger', 'anticipation', 'disgust', 'fear', 'joy', 'sadness', 'surprise', 'trust')
colnames(tdmnew) <- colnames(tdm)
comparison.cloud(tdmnew, random.order=FALSE,
colors = c("#00B2FF", "red", "#FF0099", "#6600CC", "green", "orange", "blue", "brown"),
title.size=1, max.words=200, scale=c(2.4, 0.4),rot.per=0.4)
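#To keep a copy of the comparison cloud alongside the CSV output, it could be written to a
#PNG device (a sketch; the filename is arbitrary):
#png("comparison_cloud_may2020.png", width = 900, height = 900)
#comparison.cloud(tdmnew, random.order=FALSE,
#                 colors = c("#00B2FF", "red", "#FF0099", "#6600CC", "green", "orange", "blue", "brown"),
#                 title.size=1, max.words=200, scale=c(2.4, 0.4), rot.per=0.4)
#dev.off()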
